---
title: "Calculate Cosine Similarity Scores"
output: html_document
---

```{r setup, include=FALSE}
getwd()
library(tm)
library(dplyr)
library(tidyverse)
library(lolog) 
library(network)
library(dotwhisker)
library(magrittr)
library(gridExtra)
library(quanteda.textstats)
library(quanteda) 
library(pdftools)
library(tm)
library(igraph)
```
# 02.19.09
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/02.19.09 copy"),
  readerControl = list(reader = readPDF)
)

# Extract the text of each document as a named list (one element per document).
# lapply() is used instead of sapply() because sapply() simplifies to a matrix
# whenever every PDF yields the same number (> 1) of text elements, which
# breaks the per-file reshaping done later in this chunk.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text`.
#
# text: character vector; all elements are pooled (e.g. the pages of one PDF).
# Returns a single integer.
#
# Splitting a string that starts with whitespace yields a leading empty
# string; those empty pieces are excluded so they are not counted as words.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# vapply() guarantees one integer per document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file. `file_number` is NA
# (with a coercion warning) for names without "Part<n>"; the final row order
# is alphabetical by lower-cased file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number) %>%
  mutate(file = tolower(file)) %>%
  arrange(file)
for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Comment metadata shared by all later chunks; names are lower-cased so that
# sorting by name is case-insensitive.
comment <- read.csv("data/comments_max_cos.csv")
comment[["name"]] <- tolower(comment[["name"]])

# Rows for comment round "1", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_1 <- arrange(filter(comment, comment_number == "1"), name)

# Attach the similarity, length and readability measures column-wise.
feb_19_2009 <- cbind(
  comment_1,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# 10.01.13-1
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/10.01.13-1 copy"),
  readerControl = list(reader = readPDF)
)

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file, then order the rows
# alphabetically by lower-cased file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text), file = tolower(file)) %>%
  arrange(file)
for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]

```

```{r}
# Rows for comment round "3", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_3 <- arrange(filter(comment, comment_number == "3"), name)

# Attach the similarity, length and readability measures column-wise.
october_1_13_1 <- cbind(
  comment_3,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# 10.01.13-2
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/10.01.13-2 copy"),
  readerControl = list(reader = readPDF)
)

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file; file names are
# trimmed and lower-cased before the rows are ordered alphabetically.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file = tolower(trimws(as.character(file)))
  ) %>%
  ungroup() %>%
  arrange(file)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]

```

```{r}
# Rows for comment round "2", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_2 <- arrange(filter(comment, comment_number == "2"), name)

# Attach the similarity, length and readability measures column-wise.
october_1_13_2 <- cbind(
  comment_2,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# 12.22.13
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/12.22.13 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "4", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_4 <- arrange(filter(comment, comment_number == "4"), name)

# Attach the similarity, length and readability measures column-wise.
december_22_13 <- cbind(
  comment_4,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# January 15, 2014 1
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/01.15.14-1 copy"),
  readerControl = list(reader = readPDF)
)

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file. `file_number` is NA
# (with a coercion warning) for names without "Part<n>"; the final row order
# is alphabetical by lower-cased file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number) %>%
  mutate(file = tolower(file)) %>%
  arrange(file)
for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]

```

```{r}
# Rows for comment round "6", ordered by commenter name.
# NOTE(review): the object is called comment_5 but selects comment_number
# "6" -- confirm this mapping is intentional.
comment_5 <- arrange(filter(comment, comment_number == "6"), name)

# Print the ordered names for a visual check against the document order.
select(comment_5, name)

# Attach the similarity, length and readability measures column-wise.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
jan_1_14 <- cbind(
  comment_5,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# Jan 15, 2014 2
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/01.15.14-2 copy"),
  readerControl = list(reader = readPDF)
)

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file. `file_number` is NA
# (with a coercion warning) for names without "Part<n>"; the final row order
# is alphabetical by lower-cased file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number) %>%
  mutate(file = tolower(file)) %>%
  arrange(file)
for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "5", ordered by commenter name.
# NOTE(review): the object is called comment_6 but selects comment_number
# "5" -- confirm this mapping is intentional.
comment_6 <- arrange(filter(comment, comment_number == "5"), name)

# Attach the similarity, length and readability measures column-wise.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
jan_1_14_2 <- cbind(
  comment_6,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# Feb 23, 2014
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last). The dot in the
# pattern is escaped so that only a literal ".pdf" extension matches.
pdf_files <- list.files(
  "final-frameworks/02.23.14 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

# Must be taken AFTER `df_c` is rebuilt for this folder; otherwise the
# readability scores below would be computed on the previous chunk's texts.
for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "7", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_7 <- arrange(filter(comment, comment_number == "7"), name)

# Attach the similarity, length and readability measures column-wise.
feb_23_14 <- cbind(
  comment_7,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# April 9, 2014
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/04.09.14 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "8", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_8 <- arrange(filter(comment, comment_number == "8"), name)

# Attach the similarity, length and readability measures column-wise.
april_9_14 <- cbind(
  comment_8,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# April 11, 2014
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/04.11.14 copy"),
  readerControl = list(reader = readPDF)
)

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file. `file_number` is NA
# (with a coercion warning) for names without "Part<n>"; the final row order
# is alphabetical by lower-cased file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number) %>%
  mutate(file = tolower(file)) %>%
  arrange(file)
for_token <- as.character(df_c$text)

# Print the ordered file names for a visual check of the document order.
df_c$file

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "9", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_9 <- arrange(filter(comment, comment_number == "9"), name)

# Attach the similarity, length and readability measures column-wise.
april_11_14 <- cbind(
  comment_9,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# April 14, 2014
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/04.14.14 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "10", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_10 <- arrange(filter(comment, comment_number == "10"), name)

# Attach the similarity, length and readability measures column-wise.
april_14_14 <- cbind(
  comment_10,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# May 2, 2014
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/05.02.14 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]

```

```{r}
# Rows whose `date` equals "201452", ordered by commenter name.
# NOTE(review): this chunk selects on `date` whereas the sibling chunks select
# on `comment_number` -- confirm this is intentional.
comment_11 <- arrange(filter(comment, date == "201452"), name)

# Attach the similarity, length and readability measures column-wise.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
may_2_14 <- cbind(
  comment_11,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# September 19, 2014
```{r}
# Load every PDF in this framework folder into a tm corpus.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/09.19.14 copy"),
  readerControl = list(reader = readPDF)
)

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file. `file_number` is NA
# (with a coercion warning) for names without "Part<n>"; the final row order
# is alphabetical by lower-cased file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number) %>%
  mutate(file = tolower(file)) %>%
  arrange(file)
for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "12", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_12 <- arrange(filter(comment, comment_number == "12"), name)

# Attach the similarity, length and readability measures column-wise.
sep_19_14 <- cbind(
  comment_12,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# January 9, 2015 1
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/01.09.15-1 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "14", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_14 <- arrange(filter(comment, comment_number == "14"), name)

# Attach the similarity, length and readability measures column-wise.
jan_9_15_1 <- cbind(
  comment_14,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# January 9, 2015 2
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/01.09.15-2 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "13", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_13 <- arrange(filter(comment, comment_number == "13"), name)

# Attach the similarity, length and readability measures column-wise.
jan_9_15_2 <- cbind(
  comment_13,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# January 14, 2015
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/01.14.15 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows for comment round "15", ordered by commenter name.
# NOTE(review): rows are assumed to line up one-to-one with the score vectors
# computed in the preceding chunk -- confirm.
comment_15 <- arrange(filter(comment, comment_number == "15"), name)

# Attach the similarity, length and readability measures column-wise.
jan_14_15 <- cbind(
  comment_15,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# January 16, 2015
```{r}
# List the PDFs in this folder and order them by the number that follows
# "Part" in the path (files without such a number sort last).
pdf_files <- list.files(
  "final-frameworks/01.16.15 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Return the permutation that sorts by part number. `files` is unused; it is
# kept so the signature stays unchanged.
custom_sort <- function(files, parts) {
  order(parts)
}
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# Named list with one element per document. lapply() is used instead of
# sapply() because sapply() simplifies to a matrix whenever every PDF yields
# the same number (> 1) of text elements, which breaks the reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Count the whitespace-delimited words in `text` (a character vector whose
# elements are pooled). Empty strings produced by leading whitespace are not
# counted. Returns a single integer.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  sum(nzchar(words))
}

# Word count per document, dropping the first (reference) document.
# NOTE(review): this vector follows the corpus order, while the similarity
# scores below follow the row order of `df_c` -- confirm both orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted text element; the row names carry the element names
# produced by unlist() (presumably "<file>.pdf" plus an index for multi-page
# documents -- confirm against the corpus ids).
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove only the trailing ".pdf<index>". An unanchored pattern such as ".p"
# would cut the name at the first "<any character>p" and truncate file names
# that contain a lowercase "p".
df$files <- sub("\\.pdf\\d*$", "", df$files, ignore.case = TRUE)
names(df) <- c('text','file')

# Collapse the text elements into one string per file and order the rows by
# part number. `file_number` is NA (with a coercion warning) for names
# without "Part<n>".
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(
    text = as.character(text),
    file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))
  ) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# Tokenise, lower-case, build 5-gram features and stem them.
# `remove_twitter` and `ngrams` are no longer passed: current quanteda
# releases ignore them in tokens() and only warn that they are not used.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)
toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first (reference) row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# Flesch score per document.
# NOTE(review): as.matrix() on the returned data frame produces a character
# matrix, so `readability` ends up as formatted strings -- confirm downstream
# code converts it with as.numeric().
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the reference document's own entry from every result vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "16", ordered by name.
comment_16 <- arrange(filter(comment, comment_number == "16"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
jan_16_15 <- cbind(
  comment_16,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# February 6, 2015 - 1
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/02.06.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

# Preview the collapsed texts.
head(df_c)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "20", ordered by name.
comment_20 <- arrange(filter(comment, comment_number == "20"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
feb_6_15_1 <- cbind(
  comment_20,
  final_sim,
  word_lengths,
  readability,
  matching
)

# Print each name next to its cosine similarity for a visual check.
select(feb_6_15_1, name, final_sim)
```

# February 6, 2015 - 2
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/02.06.15-2 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "19", ordered by name.
comment_19 <- arrange(filter(comment, comment_number == "19"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
feb_6_15_2 <- cbind(
  comment_19,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# February 6, 2015 - 3
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/02.06.15-3 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "18", ordered by name.
comment_18 <- arrange(filter(comment, comment_number == "18"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
feb_6_15_3 <- cbind(
  comment_18,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# February 6, 2015 - 4
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/02.06.15-4 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "17", ordered by name.
comment_17 <- arrange(filter(comment, comment_number == "17"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
feb_6_15_4 <- cbind(
  comment_17,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# February 20, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/02.20.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "21", ordered by name.
comment_21 <- arrange(filter(comment, comment_number == "21"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
feb_20_15 <- cbind(
  comment_21,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# April 30, 2015

```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/04.30.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "22", ordered by name.
comment_22 <- arrange(filter(comment, comment_number == "22"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
april_30_15 <- cbind(
  comment_22,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# May 1, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/05.01.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "23", ordered by name.
comment_23 <- arrange(filter(comment, comment_number == "23"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
may_1_15 <- cbind(
  comment_23,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# May 8, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/05.08.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "24", ordered by name.
comment_24 <- arrange(filter(comment, comment_number == "24"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
may_8_15 <- cbind(
  comment_24,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# May 29, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/05.29.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "25", ordered by name.
comment_25 <- arrange(filter(comment, comment_number == "25"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
may_29_15 <- cbind(
  comment_25,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# June 12, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/06.12.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "26", ordered by name.
comment_26 <- arrange(filter(comment, comment_number == "26"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
june_12_15 <- cbind(
  comment_26,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# June 17, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/06.17.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "27", ordered by name.
comment_27 <- arrange(filter(comment, comment_number == "27"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
june_17_15 <- cbind(
  comment_27,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# June 18, 2015
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/06.18.15 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "28", ordered by name.
comment_28 <- arrange(filter(comment, comment_number == "28"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
june_18_15 <- cbind(
  comment_28,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# April 1, 2016
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/04.01.16 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "48", ordered by name.
comment_48 <- arrange(filter(comment, comment_number == "48"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
april_1_16 <- cbind(
  comment_48,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# April 22, 2016
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/04.22.16 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "49", ordered by name.
comment_49 <- arrange(filter(comment, comment_number == "49"), name)

# Append the score vectors computed in the preceding chunk as extra columns.
# cbind() pairs rows by position only, so the rows are assumed to be in the
# same order as the scored documents -- TODO confirm.
april_22_2016 <- cbind(
  comment_49,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# June 30, 2016
```{r}
# Collect this batch's PDF paths (top level of the folder only).
pdf_files <- list.files(
  "final-frameworks/06.30.16 copy",
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = FALSE
)

# Number that follows "Part" in each path (NA when there is none).
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Ordering permutation for `parts`; `files` is accepted but not used.
custom_sort <- function(files, parts) {
  order(parts)
}

# Read the PDFs in ascending Part order.
pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]
pdf_corpus <- Corpus(URISource(pdf_files), readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-file reshaping below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of whitespace-separated pieces across all strings in `text`.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

# One row per extracted string; row names carry the source document name.
df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Remove ".pdf" and any index appended by unlist(). Splitting on the regex
# ".p" cut at the first "<any character>p", so a name containing a lowercase
# "p" before the extension was truncated.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c('text','file')

# One combined text per file, ordered by the Part number in its name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)

for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are omitted: they are not tokens() arguments
# in quanteda 3.x (which quanteda.textstats requires) and are only reported
# as unused there.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

# Stemmed document-feature matrix built from word 5-grams.
toks5 <- dfm(tokens_ngrams(toks, n=5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first document.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on this result presumably gives a character
# matrix, so `readability` is stored as text; left unchanged so the column
# type seen by later code stays the same -- confirm before converting.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's entry from every score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1,2]
```

```{r}
# Rows of `comment` whose comment_number is "29", sorted by name.
comment_29 <- arrange(filter(comment, comment_number == "29"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
june_30_16 <- cbind(comment_29, final_sim, word_lengths, readability, matching)
```
# August 16, 2016 1
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/08.16.16 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]

```

```{r}
# Rows of `comment` whose comment_number is "50", sorted by name.
comment_50 <- arrange(filter(comment, comment_number == "50"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
aug_16_16 <- cbind(comment_50, final_sim, word_lengths, readability, matching)
```

# August 16, 2016 2
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/08.16.16-2 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "51", sorted by name.
comment_51 <- arrange(filter(comment, comment_number == "51"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
aug_16_16_2 <- cbind(comment_51, final_sim, word_lengths, readability, matching)

```							

# September 5, 2016 1
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/09.05.16 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "52", sorted by name.
comment_52 <- arrange(filter(comment, comment_number == "52"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
sep_5_16 <- cbind(comment_52, final_sim, word_lengths, readability, matching)
```

# September 5, 2016 2
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/09.05.16-2 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "53", sorted by name.
comment_53 <- arrange(filter(comment, comment_number == "53"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
sep_5_16_2 <- cbind(comment_53, final_sim, word_lengths, readability, matching)
```

# September 8, 2016
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/09.08.16 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "54", sorted by name.
comment_54 <- arrange(filter(comment, comment_number == "54"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
sep_8_16 <- cbind(comment_54, final_sim, word_lengths, readability, matching)
```

# September 19, 2016
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/09.19.16 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "55", sorted by name.
comment_55 <- arrange(filter(comment, comment_number == "55"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
sep_19_16 <- cbind(comment_55, final_sim, word_lengths, readability, matching)
```
# February 3, 2017
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/02.03.17 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "56", sorted by name.
comment_56 <- arrange(filter(comment, comment_number == "56"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
feb_3_17 <- cbind(comment_56, final_sim, word_lengths, readability, matching)
```

# June 30, 2017
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/06.30.17 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "57", sorted by name.
comment_57 <- arrange(filter(comment, comment_number == "57"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
june_30_17 <- cbind(comment_57, final_sim, word_lengths, readability, matching)
```

# August 10, 2017 
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/08.10.17 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "30", sorted by name.
comment_30 <- arrange(filter(comment, comment_number == "30"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
aug_10_17 <- cbind(comment_30, final_sim, word_lengths, readability, matching)
```
# September 15, 2017 1
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/09.15.17-1 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "31", sorted by name.
comment_31 <- arrange(filter(comment, comment_number == "31"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
sep_15_17_1 <- cbind(comment_31, final_sim, word_lengths, readability, matching)
```


# September 15, 2017 2
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/09.15.17-2 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "32", sorted by name.
comment_32 <- arrange(filter(comment, comment_number == "32"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
# NOTE(review): this section's heading and folder say September 15, 2017
# ("09.15.17-2") but the object is named sep_19_17_2 -- looks like a typo;
# the name is kept because later code may refer to it.
sep_19_17_2 <- cbind(comment_32, final_sim, word_lengths, readability, matching)
```

# October 13, 2017
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/10.13.17 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "34", sorted by name.
comment_34 <- arrange(filter(comment, comment_number == "34"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
oct_13_17 <- cbind(comment_34, final_sim, word_lengths, readability, matching)
```

# October 20, 2017
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/10.20.17 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "35", sorted by name.
comment_35 <- arrange(filter(comment, comment_number == "35"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
oct_20_17 <- cbind(comment_35, final_sim, word_lengths, readability, matching)
```

# January 15, 2018
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/01.15.18 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` whose comment_number is "36", sorted by name.
comment_36 <- arrange(filter(comment, comment_number == "36"), name)

# Append the current score vectors as columns. cbind() pairs rows by position,
# so this assumes the scores are in the same order as the sorted rows.
jan_15_18 <- cbind(comment_36, final_sim, word_lengths, readability, matching)
```


# June 20, 2018 1
```{r}
# Score the PDFs in this folder against the first document in part order:
# 5-gram cosine and Jaccard similarity, word counts, and Flesch readability.
pdf_files <- list.files("final-frameworks/06.20.18-1 copy", full.names = TRUE,
                        pattern = "\\.pdf$", recursive = FALSE)

# Integer following "Part" in each path (NA when absent), used for ordering.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

# Permutation that sorts `parts` ascending; `files` is accepted but unused.
custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(URISource(pdf_files),
                     readerControl = list(reader = readPDF))

# lapply() always returns one list element per document. sapply() simplifies
# to a matrix when every document yields the same number (> 1) of strings,
# which would break the per-document counts and the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Number of pieces obtained by splitting `text` on runs of whitespace.
count_words <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  length(words)
}

# Per-document word counts, without the first (reference) document.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# Keep the part of each row name before the literal ".p" of the extension.
# fixed = TRUE is required: as a regex, "." matches any character, so a name
# like "Group Part3.pdf" would be cut at "up" instead of ".p".
df$files <- vapply(strsplit(df$files, ".p", fixed = TRUE),
                   `[[`, character(1), 1)
names(df) <- c("text", "file")

# One row per file: join its strings, then order rows by part number.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# `remove_twitter` and `ngrams` are no longer tokens() arguments in the
# quanteda versions that work with quanteda.textstats; they only triggered an
# "arguments are not used" warning, so they are dropped. The 5-grams are built
# explicitly with tokens_ngrams() below.
toks <- tokens(for_token,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to the first row of the dfm.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

# NOTE(review): as.matrix() on the readability data frame presumably gives a
# character matrix, so `readability` ends up holding strings; left unchanged
# so the type seen by later code stays the same.
readability <- as.matrix(textstat_readability(for_token,
                                              measure = "Flesch",
                                              remove_hyphens = TRUE))

# Drop the first document's own entry from each score vector.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]
readability <- readability[-1, 2]
```

```{r}
# Rows of `comment` with comment_number "38", sorted by name.
# NOTE(review): the 06.20.18-1 folder is paired with comment 38 here while the
# 06.20.18-2 folder is paired with 37 — confirm the numbering is intended.
comment_38 <- arrange(filter(comment, comment_number == "38"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
june_20_18_1 <- cbind(
  comment_38,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# June 20, 2018 2
```{r}
# Scores each PDF in this folder against the first one after sorting by "Part"
# number: cosine and Jaccard similarity on stemmed 5-grams, Flesch readability,
# and a whitespace word count. The first entry is dropped from every result
# vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_files <- list.files(
  "final-frameworks/06.20.18-2 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)

# Digits that follow "Part" in each path; used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(
  URISource(pdf_files),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file: join its text elements, then order by the "Part" number
# embedded in the file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]

```

```{r}
# Rows of `comment` with comment_number "37", sorted by name.
# NOTE(review): the 06.20.18-2 folder is paired with comment 37 here while the
# 06.20.18-1 folder is paired with 38 — confirm the numbering is intended.
comment_37 <- arrange(filter(comment, comment_number == "37"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
june_20_18 <- cbind(
  comment_37,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# July 6, 2018
```{r}
# Scores each PDF in this folder against the first one after sorting by "Part"
# number: cosine and Jaccard similarity on stemmed 5-grams, Flesch readability,
# and a whitespace word count. The first entry is dropped from every result
# vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_files <- list.files(
  "final-frameworks/07.06.18 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)

# Digits that follow "Part" in each path; used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(
  URISource(pdf_files),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file: join its text elements, then order by the "Part" number
# embedded in the file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```


```{r}
# Rows of `comment` with comment_number "58", sorted by name.
# NOTE(review): 58 is far from the 36-46 numbers used for the neighbouring
# dates — confirm it is the intended comment number.
comment_58 <- arrange(filter(comment, comment_number == "58"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
july_6_18 <- cbind(
  comment_58,
  final_sim,
  word_lengths,
  readability,
  matching
)
```


# September 7, 2018
```{r}
# Scores each PDF in this folder against the first one after sorting by "Part"
# number: cosine and Jaccard similarity on stemmed 5-grams, Flesch readability,
# and a whitespace word count. The first entry is dropped from every result
# vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_files <- list.files(
  "final-frameworks/09.07.18 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)

# Digits that follow "Part" in each path; used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(
  URISource(pdf_files),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file: join its text elements, then order by the "Part" number
# embedded in the file name.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` with comment_number "39", sorted by name.
comment_39 <- arrange(filter(comment, comment_number == "39"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
sep_7_18 <- cbind(
  comment_39,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# March 6, 2019
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_corpus <- Corpus(
  DirSource("final-frameworks/03.06.19 copy"),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows corpus order while the scores follow the
# lower-cased file-name order of df_c — confirm the two orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file with its text elements joined. The "Part" ordering is
# applied first and then overridden by the lower-cased file-name sort, which
# only falls back to it for names that tie.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
df_c$file <- tolower(df_c$file)
df_c <- df_c %>%
  arrange(file)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` whose date equals "201936", sorted by name.
# NOTE(review): presumably "201936" encodes 2019-03-06 without zero padding —
# confirm against how `comment$date` is built.
comment_40 <- arrange(filter(comment, date == "201936"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
mar_6_19 <- cbind(
  comment_40,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# May 10, 2019
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_files <- list.files(
  "final-frameworks/05.10.19 copy",
  full.names = TRUE,
  pattern = "\\.pdf$",
  recursive = FALSE
)

# Digits that follow "Part" in each path; used as the sort key.
part_numbers <- as.integer(str_extract(pdf_files, "(?<=Part)\\d+"))

custom_sort <- function(files, parts) {
  order(parts)
}

pdf_files <- pdf_files[custom_sort(pdf_files, part_numbers)]

pdf_corpus <- Corpus(
  URISource(pdf_files),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows the "Part"-number order of pdf_files,
# while the scores follow the lower-cased file-name order of df_c — confirm
# the two orders agree for this folder.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file with its text elements joined. The "Part" ordering is
# applied first and then overridden by the lower-cased file-name sort, which
# only falls back to it for names that tie.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
df_c$file <- tolower(df_c$file)
df_c <- df_c %>%
  arrange(file)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` whose date equals "2019510", sorted by name.
# NOTE(review): presumably "2019510" encodes 2019-05-10 without zero padding —
# confirm against how `comment$date` is built.
comment_41 <- arrange(filter(comment, date == "2019510"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
may_10_19 <- cbind(
  comment_41,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# November 12, 2019
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
#
# The assignment target was missing here ("<- <- Corpus(...)"), which is a
# syntax error; the corpus is stored in `pdf_corpus`, the name the next
# statement reads.
pdf_corpus <- Corpus(
  DirSource("final-frameworks/11.12.19 copy"),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows corpus order while the scores follow the
# lower-cased file-name order of df_c — confirm the two orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file with its text elements joined.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text))

# Printed so the file names show up in the knitted output.
df_c$file

df_c$file <- tolower(df_c$file)

df_c <- df_c %>%
  arrange(file)

for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` whose date equals "20191112", sorted by name.
comment_42 <- arrange(filter(comment, date == "20191112"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
nov_12_19 <- cbind(
  comment_42,
  final_sim,
  word_lengths,
  readability,
  matching
)

# Show the name / cosine-similarity pairs in the knitted output.
select(nov_12_19, name, final_sim)
```

# December 2, 2019
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_corpus <- Corpus(
  DirSource("final-frameworks/12.02.19 copy"),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows corpus order while the scores follow the
# lower-cased file-name order of df_c — confirm the two orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")
df$file <- tolower(df$file)

# Sort by lower-cased file name, then collapse to one row per file with its
# text elements joined.
df_c <- df %>%
  arrange(file)
df_c <- df_c %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text))

# Printed so the per-file table shows up in the knitted output.
df_c

for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` whose date equals "2019122", sorted by name.
# NOTE(review): presumably "2019122" encodes 2019-12-02 without zero padding —
# confirm against how `comment$date` is built.
comment_43 <- arrange(filter(comment, date == "2019122"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
dec_2_19 <- cbind(
  comment_43,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# March 6, 2020
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_corpus <- Corpus(
  DirSource("final-frameworks/03.06.20 copy"),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows corpus order while the scores follow the
# lower-cased file-name order of df_c — confirm the two orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file with its text elements joined. The "Part" ordering is
# applied first and then overridden by the lower-cased file-name sort, which
# only falls back to it for names that tie.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
df_c$file <- tolower(df_c$file)
df_c <- df_c %>%
  arrange(file)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```


```{r}
# Rows of `comment` with comment_number "44", sorted by name.
comment_44 <- arrange(filter(comment, comment_number == "44"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
march_6_2020 <- cbind(
  comment_44,
  final_sim,
  word_lengths,
  readability,
  matching
)
```

# April 13, 2020
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_corpus <- Corpus(
  DirSource("final-frameworks/04.13.20 copy"),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows corpus order while the scores follow the
# lower-cased file-name order of df_c — confirm the two orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file with its text elements joined. The "Part" ordering is
# applied first and then overridden by the lower-cased file-name sort, which
# only falls back to it for names that tie.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
df_c$file <- tolower(df_c$file)
df_c <- df_c %>%
  arrange(file)
for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` with comment_number "45", sorted by name.
comment_45 <- arrange(filter(comment, comment_number == "45"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
april_13_2020 <- cbind(
  comment_45,
  final_sim,
  word_lengths,
  readability,
  matching
)
```
# December 14, 2020
```{r}
# Scores each PDF in this folder against the first one in lower-cased
# file-name order: cosine and Jaccard similarity on stemmed 5-grams, Flesch
# readability, and a whitespace word count. The first entry is dropped from
# every result vector (`final_sim`, `matching`, `readability`, `word_lengths`).
pdf_corpus <- Corpus(
  DirSource("final-frameworks/12.14.20 copy"),
  readerControl = list(reader = readPDF)
)

# lapply() always yields one list element per document. sapply() simplifies to
# a matrix when every document has the same number of text elements, which
# breaks the data frame built below.
pdf_text <- lapply(pdf_corpus, as.character)

# Counts the pieces produced by splitting on runs of whitespace, pooled over
# all text elements of one document.
count_words <- function(text) {
  length(unlist(strsplit(text, "\\s+")))
}

# [-1] removes the first corpus entry, matching the [-1] applied to the scores.
# NOTE(review): word_lengths follows corpus order while the scores follow the
# lower-cased file-name order of df_c — confirm the two orders agree.
word_lengths <- vapply(pdf_text, count_words, integer(1))[-1]

df <- data.frame(unlist(pdf_text))
df$files <- rownames(df)
# unlist() names each text element "<file>.pdf", plus a running index when a
# document has several elements. Strip exactly that suffix: the previous
# strsplit(., ".p") treated ".p" as a regex ("any character, then p") and cut
# names at the first lower-case "p" that follows another character.
df$files <- sub("\\.pdf\\d*$", "", df$files)
names(df) <- c("text", "file")

# One row per file with its text elements joined. The "Part" ordering is
# applied first and then overridden by the lower-cased file-name sort, which
# only falls back to it for names that tie.
df_c <- df %>%
  group_by(file) %>%
  summarize(text = paste(text, collapse = ", ")) %>%
  mutate(text = as.character(text)) %>%
  mutate(file_number = as.numeric(sub(".*Part(\\d+).*", "\\1", file))) %>%
  arrange(file_number)
df_c$file <- tolower(df_c$file)
df_c <- df_c %>%
  arrange(file)

for_token <- as.character(df_c$text)

# quanteda 3+ dropped the remove_twitter and ngrams arguments of tokens()
# (NOTE(review): confirm against the installed version), so they are omitted;
# the 5-grams are built explicitly with tokens_ngrams() below.
toks <- tokens(
  for_token,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
toks <- tokens_tolower(toks)

toks5 <- dfm(tokens_ngrams(toks, n = 5))
toks_dfm <- dfm_wordstem(toks5)

# Similarity of every document to row 1 of the matrix.
cos_sim <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "cosine")
matching <- textstat_simil(toks_dfm, y = toks_dfm[1, ], method = "jaccard")

readability <- textstat_readability(
  for_token,
  measure = "Flesch",
  remove_hyphens = TRUE
)

# Element 1 is row 1 compared with itself, so it is dropped.
final_sim <- as.numeric(cos_sim)[-1]
matching <- as.numeric(matching)[-1]

# Read the score column (column 2) from the data frame itself so it stays
# numeric; going through as.matrix() turned the scores into formatted strings.
readability <- readability[[2]][-1]
```

```{r}
# Rows of `comment` with comment_number "46", sorted by name.
comment_46 <- arrange(filter(comment, comment_number == "46"), name)

# cbind() pairs rows by position, so this relies on the score vectors being in
# the same order as the sorted names — TODO confirm.
dec_14_2020 <- cbind(
  comment_46,
  final_sim,
  word_lengths,
  readability,
  matching
)

# Show the name / cosine-similarity pairs in the knitted output.
select(dec_14_2020, name, final_sim)
```

# combine all of the files into one large dataset with cosine sim scores
```{r}
# Stack every per-date score table, in the order listed, into one data frame
# and write it to "final_text.csv" (row names included, as write.csv() does by
# default).
final_text_analysis <- do.call(
  rbind,
  lapply(
    list(
      feb_19_2009, october_1_13_1, october_1_13_2, december_22_13,
      jan_1_14, jan_1_14_2, feb_23_14, april_9_14, april_11_14,
      april_14_14, may_2_14, sep_19_14,
      jan_9_15_1, jan_9_15_2, jan_14_15,
      jan_16_15, feb_6_15_1, feb_6_15_2,
      feb_6_15_3, feb_6_15_4, feb_20_15,
      april_30_15, may_1_15, may_8_15,
      may_29_15, june_12_15, june_17_15,
      june_18_15, april_1_16, april_22_2016,
      june_30_16, aug_16_16, aug_16_16_2,
      sep_5_16, sep_5_16_2, sep_8_16,
      sep_19_16, feb_3_17, june_30_17,
      aug_10_17, sep_15_17_1, sep_19_17_2,
      oct_13_17, oct_20_17, jan_15_18,
      june_20_18_1, june_20_18, july_6_18,
      sep_7_18, mar_6_19, may_10_19, nov_12_19,
      dec_2_19, march_6_2020,
      april_13_2020, dec_14_2020
    ),
    as.data.frame
  )
)

write.csv(final_text_analysis, "final_text.csv")
```


